{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['sex', 'gambling'])"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "with open('nsfw.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = ['sex', 'gambling', 'negative']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "64634949"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('dumping-cleaned-common-crawl.txt') as fopen:\n",
    "    negative = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "len(negative)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Europa Casino - Ulasan | OnlineCasinoReports Malaysia',\n",
       " 'Kasino',\n",
       " 'Kasino yang Diketengahkan',\n",
       " 'Kasino terkemuka',\n",
       " 'Kasino baru',\n",
       " 'Kasino yang dikemaskini',\n",
       " 'Kasino bergerak',\n",
       " 'Kasino Bitcoin',\n",
       " 'Jackpot progresif',\n",
       " 'Permainan kasino percuma',\n",
       " 'Bonus Kasino Eksklusif',\n",
       " 'Bonus Petaruh Besar',\n",
       " 'Pakej Bonus Kasino',\n",
       " 'Bonus Kasino Bergerak',\n",
       " 'Slot Dalam Talian',\n",
       " 'Slot Dalam Talian Percuma',\n",
       " 'Slot Wang Sebenar',\n",
       " 'Blackjack Dalam Talian',\n",
       " 'Blackjack Dalam Talian Percuma',\n",
       " 'Blackjack Dengan Wang Sebenar',\n",
       " 'Craps Dalam Talian',\n",
       " 'Craps Dalam Talian Percuma',\n",
       " 'Craps Dengan Wang Sebenar',\n",
       " 'Rolet Dalam Talian',\n",
       " 'Rolet Dalam Talian Percuma',\n",
       " 'Rolet Dengan Wang Sebenar',\n",
       " 'Poker Video Dalam Talian',\n",
       " 'Poker Video Dalam Talian Percuma',\n",
       " 'Poker Video Dengan Wang Sebenar',\n",
       " 'Poker Dalam Talian',\n",
       " 'Poker Dalam Talian Percuma',\n",
       " 'Poker Dengan Wang Sebenar',\n",
       " 'Kasino Dalam Talian',\n",
       " 'Europa Casino',\n",
       " 'Bonus Petaruh Baru!',\n",
       " '#67 of 93 Kasino Dalam Talian',\n",
       " 'URL: europacasino.com',\n",
       " 'Petaruh besar',\n",
       " 'Ciri-ciri: Download, Instant, Live Dealers, Progressive Jackpots, VIP',\n",
       " 'Europa Casino Ulasan',\n",
       " 'OnlineCasinoReportsUlasan terakhir dikemaskini pada July 30, 2017',\n",
       " 'Petaruh-petaruh dalam talian di Europa Casino boleh menikmati pelbagai permainan dalam talian yang mendebarkan di samping menyeronokkan.',\n",
       " 'Europa Casino mengendalikan sebuah platform permainan yang didokong oleh Playtech di mana terdapat lebih kurang 100 jenis permainan yang tersenarai di dalam pelbagai kategori yang menyeronokkan.',\n",
       " 'Ini termasuklah permainan-permainan slot, meja, poker dan permainan-permainan lain yang tidak kurang hebatnya.',\n",
       " 'Kategori-kategori permainan di Europa Casino menawarkan:',\n",
       " 'Blackjack',\n",
       " 'Craps',\n",
       " 'Rolet',\n",
       " 'Poker Video',\n",
       " 'Berkenaan dengan khidmat bantuan pelanggan, Europa Casino bekerja keras untuk menjaga hubungan yang baik dengan para pelanggannya.',\n",
       " 'Tiada yang lebih digemari oleh pelanggan apabila bermain di mana-mana kasino dalam talian selain daripada peluang untuk mendapatkan peratusan wang kemenangan yang tinggi, dan Europa Casino pula betul-betul menepati kehendak utama pelanggan.',\n",
       " 'Europa Casino memberikan 97.28% pembayaran bagi wang kemenangan, dan ini bermakna para pemain dalam talian boleh bertaruh dengan penuh keyakinan.',\n",
       " 'Ciri-ciri tambahan yang terdapat di Europa Casino termasuklah:',\n",
       " 'Bukan semua bahasa di benua Eropah terdapat di dalam Europa Casino, oleh itu Europa Casino nampaknya menawarkan permainan-permainan yang kurang dari segi kepelbagaian jika dibandingkan dengan kasino-kasino dalam talian terkemuka yang lain.',\n",
       " 'Keterujaan Permainan Meja Langsung di EuroGrand Casino',\n",
       " 'Slots Magic Menawarkan Sehingga $2500 Kepada Petaruh Baru',\n",
       " 'Slot Istimewa: Permainan Paling Popular dan Kemenangan Terbesar',\n",
       " 'Slots Club bet365 Casino Membawakan Yang Terhangat',\n",
       " 'Online Casino Reports',\n",
       " 'OnlineCasinoReports Malaysia',\n",
       " 'onlinecasinoreports.com.my ©2019',\n",
       " 'Judi Ceme Poker Online - *Qiuceme adalah situs judi ceme poker online Indonesia uang asli yang terbaik dengan deposit cuma 20rb rupiah dan bonus bonus yang fantastis sampai beberap...',\n",
       " 'Judi Ceme Poker Online - *Qiuceme adalah situs judi ceme poker online Indonesia uang asli yang terbaik dengan deposit cuma 20rb rupiah dan bonus bonus yang fantastis sampai beberap...',\n",
       " 'Judi Ceme Poker Online - *Qiuceme adalah situs judi ceme poker online Indonesia uang asli yang terbaik dengan deposit cuma 20rb rupiah dan bonus bonus yang fantastis sampai beberap...',\n",
       " 'Judi Ceme Poker Online - *Qiuceme adalah situs judi ceme poker online Indonesia uang asli yang terbaik dengan deposit cuma 20rb rupiah dan bonus bonus yang fantastis sampai beberap...',\n",
       " 'Tobias Jackpotcity Casino Test (2)',\n",
       " 'Judi Ceme Poker Online - *Qiuceme adalah situs judi ceme poker online Indonesia uang asli yang terbaik dengan deposit cuma 20rb rupiah dan bonus bonus yang fantastis sampai beberap...',\n",
       " 'Judi Ceme Poker Online - *Qiuceme adalah situs judi ceme poker online Indonesia uang asli yang terbaik dengan deposit cuma 20rb rupiah dan bonus bonus yang fantastis sampai beberap...',\n",
       " 'Kod Bonus Kasino Online Segera untuk Gambler Afrika Selatan - Kod Bonus Casino Online',\n",
       " 'Kasino',\n",
       " 'Blackjack',\n",
       " 'Poker',\n",
       " 'Tapak Kasino dalam talian Argentina',\n",
       " 'Tapak Kasino Dalam Talian Armenia',\n",
       " 'Tapak Kasino dalam talian Austria',\n",
       " 'Tapak Kasino dalam Bahasa Azerbaijan',\n",
       " 'Tapak Kasino dalam talian Belgium',\n",
       " 'Tapak Kasino Bermuda Online',\n",
       " 'Tapak Kasino dalam talian Bolivia',\n",
       " 'Tapak Kasino dalam talian Bosnia dan Herzegovina',\n",
       " 'Tapak Kasino Dalam Talian Brazil',\n",
       " 'Tapak Kasino Bulgaria Online',\n",
       " 'Tapak Kasino Dalam Talian Cina',\n",
       " 'Tapak Kasino dalam talian Czech',\n",
       " 'Tapak Kasino dalam talian Denmark',\n",
       " 'Tapak Kasino Dalam Talian Belanda',\n",
       " 'Tapak Kasino Online Estonia',\n",
       " 'Tapak Kasino dalam talian Finland',\n",
       " 'Tapak Kasino Dalam Talian Perancis',\n",
       " 'Tapak Kasino Online Georgia',\n",
       " 'Tapak Kasino dalam talian Jerman',\n",
       " 'Tapak Kasino dalam talian Greek',\n",
       " 'Tapak Kasino Dalam Talian Iceland',\n",
       " 'Tapak Kasino Dalam Talian India',\n",
       " 'Tapak Kasino Dalam Talian Indonesia',\n",
       " 'Tapak Kasino dalam talian Itali',\n",
       " 'Tapak Kasino Dalam Talian Jepun',\n",
       " 'Tapak Kasino Dalam Talian Korea',\n",
       " 'Tapak Kasino Dalam Talian Latvia',\n",
       " 'Tapak Kasino dalam talian Macedonian',\n",
       " 'Tapak Kasino Dalam Talian Melayu',\n",
       " 'Tapak Kasino dalam talian Malta',\n",
       " 'Tapak Kasino Dalam Talian Norway',\n",
       " 'Tapak Kasino dalam Bahasa Portugis',\n",
       " 'Tapak Kasino dalam Bahasa Romania',\n",
       " 'Tapak Kasino Dalam Talian Serbian',\n",
       " 'Tapak Kasino dalam talian Slovak',\n",
       " 'Tapak Kasino Slovenia Online',\n",
       " 'Tapak Kasino Dalam Talian Afrika Selatan',\n",
       " 'Tapak Kasino dalam Bahasa Sepanyol',\n",
       " 'Tapak Kasino dalam talian Sweden',\n",
       " 'Tapak Kasino Uzbekistan Online',\n",
       " 'Tapak Kasino Dalam Talian Vietnam',\n",
       " 'Oleh Casino',\n",
       " 'Poker video',\n",
       " 'Bonus Casino',\n",
       " 'Kasino mengikut Negeri',\n",
       " 'Kasino dalam talian oleh Conutry',\n",
       " 'Bonus Mengikut Kasino',\n",
       " 'Video Kasino Rollers Tinggi',\n",
       " 'Kod Bonus Kasino Dalam Talian > Kod Bonus Kasino Online Segera untuk Gambler Afrika Selatan',\n",
       " 'Kod Bonus Kasino Online Segera untuk Gambler Afrika Selatan',\n",
       " 'Kod Bonus Casino Online untuk Afrika Selatan',\n",
       " 'Kelebihan Memilih Kasino Afrika SelatanMemasukkannya pada dasarnya, dengan menggunakan kelab kedai di Afrika Selatan sebagai kaedah untuk mendekati sebuah kelab perjudian yang tersusun terutamanya berdasarkan keperluan anda.',\n",
       " 'Top 10 Best Eropah Casinos Online 2018:',\n",
       " 'bermain Casino',\n",
       " 'Bermain 888 Casino',\n",
       " 'Bermain Bwin Casino',\n",
       " 'Bermain 777 Casino',\n",
       " '100 putaran percuma di Kasino Casumo',\n",
       " 'Main Casino Casumo',\n",
       " 'Bermain Casino Parti',\n",
       " 'Bermain Jackpot City Casino',\n",
       " 'Bermain Casino Com',\n",
       " 'Bermain SlotoCash Casino',\n",
       " 'Bermain WinkSlots Casino',\n",
       " 'Bermain SpinPalace Casino',\n",
       " 'Bermain Mansion Casino',\n",
       " 'Bermain SlotsHeaven Casino',\n",
       " 'Top 10 Best USA Casinos Online 2018:',\n",
       " 'Bermain Lincoln Casino',\n",
       " 'Mainkan RedStag Casino',\n",
       " 'Main VegasCrest Casino',\n",
       " 'Main Casino LibertySlots',\n",
       " '$ 3,750 Casino Bonus Selamat datang',\n",
       " 'Bermain Bovada Casino',\n",
       " 'MENGGUNAKAN KOD COUPON: CASINO400',\n",
       " 'Bermain Planet7 Casino',\n",
       " 'jackpots progresif',\n",
       " 'Bermain LasVegas USA Casino',\n",
       " 'Bermain SilverOak Casino',\n",
       " 'Bermain SlotsPlus Casino',\n",
       " 'Bermain BetOnline Casino',\n",
       " 'Bonus kasino percuma:',\n",
       " '125 membebaskan tiada bonus deposit di LVbet Casino',\n",
       " '25 putaran percuma bonus di LuckyDino Casino',\n",
       " 'VEGAS CASINO ONLINE BERMULA DENGAN $ 25 CHIP',\n",
       " '80 tiada bonus kasino deposit di PocketFruity Casino',\n",
       " '120 tiada bonus deposit di WickedJackpots Casino',\n",
       " '125 putaran percuma bonus di Betbright Casino',\n",
       " '105 putaran percuma di BitStarz Casino',\n",
       " '60 putaran percuma bonus di Winorama Casino',\n",
       " '65 membebaskan tiada bonus deposit di MarathonBet Casino',\n",
       " '105 tiada bonus kasino deposit di Insta Casino',\n",
       " '15 putaran percuma bonus kasino di Sverige Casino',\n",
       " '65 tiada bonus kasino deposit di Winzino Casino',\n",
       " '€ 810 Percuma Kejohanan Kasino',\n",
       " 'Bonus Kasino Signup 640%',\n",
       " 'EUR 155 Free Chip Casino',\n",
       " '€ 22 Online Casino Tournament',\n",
       " 'Kejohanan Kasino 777 Eur',\n",
       " 'Tiket 205 Casino Percuma Eur',\n",
       " 'Kejohanan Kasino 915 Online',\n",
       " '45 Free Spins tiada kasino deposit',\n",
       " '40 percuma berputar tanpa kasino deposit',\n",
       " 'Bonus Oleh Kasino:',\n",
       " 'Encik Green Casino',\n",
       " 'RoyalPanda Casino',\n",
       " 'Flamantis Casino',\n",
       " 'Intragame Casino',\n",
       " 'Boss Casino',\n",
       " 'Casino SuperGaminator',\n",
       " 'Kasino SekaBet',\n",
       " 'Casino tenaga',\n",
       " 'Peraturan Kasino',\n",
       " 'Noxwin Casino',\n",
       " 'Kasino LVbet',\n",
       " 'Kasino Bordeaux',\n",
       " 'Slots500 Casino',\n",
       " 'Perdana Casino',\n",
       " 'Casino PrimeFortune',\n",
       " 'SimbaGames Casino',\n",
       " 'Casino Mega',\n",
       " 'Casino PrimeScratchCards',\n",
       " 'Casino PrimeSlots',\n",
       " 'BGO Casino',\n",
       " 'Iw Casino',\n",
       " 'Ladbrokes Casino',\n",
       " 'Winorama Casino',\n",
       " 'Gratorama Casino',\n",
       " 'Scratch Mania Casino',\n",
       " 'Winspark Casino',\n",
       " 'KaRaMbA Casino',\n",
       " 'Hopa Casino',\n",
       " '21Prive Casino',\n",
       " '24hBet Casino',\n",
       " 'Kasino 24Bettle',\n",
       " '7Red Casino',\n",
       " 'Kasino 888ladies',\n",
       " 'Aha Casino',\n",
       " 'Adler Casino',\n",
       " 'AllAustralian Casino',\n",
       " 'AllBritish Casino',\n",
       " 'AllIrish Casino',\n",
       " 'Amsterdams Casino',\n",
       " 'Anna Casino',\n",
       " 'Argo Casino',\n",
       " 'Kasino Bertil',\n",
       " 'Bet365 Vegas Casino',\n",
       " 'Betotto Casino',\n",
       " 'Betbright Casino',\n",
       " 'BetChan Casino',\n",
       " 'Bethard Casino',\n",
       " 'Kasino Betul',\n",
       " 'Betsafe Casino',\n",
       " 'BetSpin Casino',\n",
       " 'Betsson Casino',\n",
       " 'BettingWays Casino',\n",
       " 'Bingo Casino',\n",
       " 'BitStarz Casino',\n",
       " 'BlingCity Casino',\n",
       " 'Bohemia Casino',\n",
       " 'Kasino Buck & Butler',\n",
       " 'Buzz Casino',\n",
       " 'BuzzSlots Casino',\n",
       " 'Casino Caribic',\n",
       " 'Cashmio Casino',\n",
       " 'Casino Cruise',\n",
       " 'Estrella Casino',\n",
       " 'Kasino Euro',\n",
       " 'Kasino tambahan',\n",
       " 'Promosi Page Casino',\n",
       " 'Casino Heroes',\n",
       " 'Kasino huone',\n",
       " 'Jefe Casino',\n",
       " 'Nasib kasino',\n",
       " 'Noir Casino',\n",
       " 'Bilik kasino',\n",
       " 'Stugan kasino',\n",
       " 'Casumo Casino',\n",
       " 'Kasino ChanceHill',\n",
       " 'Cherry Casino',\n",
       " 'Comeon Casino',\n",
       " 'CrazyScratch Casino',\n",
       " 'CyberClub Casino',\n",
       " 'Dansk777 Casino',\n",
       " 'Kasino Devilfish',\n",
       " 'Kasino Dhoze',\n",
       " 'Diamond7 Casino',\n",
       " 'DiamondWorld Casino',\n",
       " 'Dragonara Casino',\n",
       " 'Casino DrVegas',\n",
       " 'Kasino Dublinbet',\n",
       " 'Dunder Casino',\n",
       " 'Kasino Euro Bet',\n",
       " 'EuroLotto Casino',\n",
       " 'EuroSlots Casino',\n",
       " 'ExtraSpel Casino',\n",
       " 'Kasino Finlandia',\n",
       " 'Florijn Casino',\n",
       " 'FreeSpins Casino',\n",
       " 'GDay Casino',\n",
       " 'GDFplay Casino',\n",
       " 'Casino Glorious',\n",
       " 'Kasino emas',\n",
       " 'GoldClub Casino',\n",
       " 'Kasino GoldSpins',\n",
       " 'Kasino Gossip',\n",
       " 'GrandGames Casino',\n",
       " 'Hello Casino',\n",
       " 'Kasino Hertotto',\n",
       " 'Hey Casino',\n",
       " 'Casino Igame',\n",
       " 'Insta Casino',\n",
       " 'Joy Casino',\n",
       " 'Kaboo Casino',\n",
       " 'Karl Casino',\n",
       " 'Kasino Kolikkopelit',\n",
       " 'Kroon Casino',\n",
       " 'Kultakaivos Casino',\n",
       " 'Kasino LeijonaKasino',\n",
       " 'Leovegas Casino',\n",
       " 'Lucky31 Casino',\n",
       " 'LuckyDino Casino',\n",
       " 'Kasino MagicalVegas',\n",
       " 'MainStage Casino',\n",
       " 'Mamamia Casino',\n",
       " 'MarathonBet Casino',\n",
       " 'Maria Casino',\n",
       " 'MaxiPlay Casino',\n",
       " 'Mobilbet Casino',\n",
       " 'Casino MondoFortuna',\n",
       " 'MoonBingo Casino',\n",
       " 'Casino MoonGames',\n",
       " 'Casino halaman pendaratan',\n",
       " 'MrRingo Casino',\n",
       " 'MrSmith Casino',\n",
       " 'MrSpill Casino',\n",
       " 'Mybet Casino',\n",
       " 'DI SINI Kasino',\n",
       " 'MyJackpot Casino',\n",
       " 'Nederbet Casino',\n",
       " 'Nettiarpa Casino',\n",
       " 'Netti Casino',\n",
       " 'Casino Seterusnya',\n",
       " 'NoBonus Casino',\n",
       " 'Nordicbet Casino',\n",
       " 'NordicSlots Casino',\n",
       " 'Norges Automaten Casino',\n",
       " 'Norges Casino',\n",
       " 'Norgesspill Casino',\n",
       " 'NorgeVegas Casino',\n",
       " 'NorskeAutomater Casino',\n",
       " 'Norskelodd Casino',\n",
       " 'Kasino OkScratchcards',\n",
       " 'Kasino Optibet',\n",
       " 'PAF Casino',\n",
       " 'Kasino Paris',\n",
       " 'Playamo Casino',\n",
       " 'PlayFrank Casino',\n",
       " 'Main Hippo Casino',\n",
       " 'PocketFruity Casino',\n",
       " 'Pokies Casino',\n",
       " 'Polder Casino',\n",
       " 'Kasino Polo',\n",
       " 'RealDealBet Casino',\n",
       " 'Redbet Casino',\n",
       " 'Casino Slot Merah',\n",
       " 'ReelIssland Casino',\n",
       " 'Rizk Casino',\n",
       " 'RobinHood Casino',\n",
       " 'Kasino Royaal',\n",
       " 'Royalbloodclub Casino',\n",
       " 'Scratch2cash Casino',\n",
       " 'Kasino sutera',\n",
       " 'Kasino SirJackpot',\n",
       " 'SlotJoint Casino',\n",
       " 'SlotsandGames Casino',\n",
       " 'SlottyVegas Casino',\n",
       " 'Jadi Casino',\n",
       " 'Kasino spectra',\n",
       " 'Kasino SpilleAutomater',\n",
       " 'Spinson Casino',\n",
       " 'Casino SpinStation',\n",
       " 'Kasino Gula',\n",
       " 'Kasino Stanjames',\n",
       " 'Star Casino',\n",
       " 'Casino Staybet',\n",
       " 'Suomiarvotto Casino',\n",
       " 'SuomiAutomaatti Casino',\n",
       " 'Suomikasino Casino',\n",
       " 'Suomi Vegas Casino',\n",
       " 'Casino SuperLenny',\n",
       " 'Kasino Sweden',\n",
       " 'Kasino Svea',\n",
       " 'Kasino Svenskalotter',\n",
       " 'Sverige Automaten Casino',\n",
       " 'Kasino Sverige',\n",
       " 'Sverige Kronan Casino',\n",
       " 'Keseronokan Casino',\n",
       " 'Kasino Tipbet',\n",
       " 'Casino Halaman Landing',\n",
       " 'TipTop Casino',\n",
       " 'TouchLucky Casino',\n",
       " 'Triobet Casino',\n",
       " 'Casino TonyBet',\n",
       " 'Unibet Casino',\n",
       " 'Wayne Casino',\n",
       " 'Kasino Whitebet',\n",
       " 'WickedJackpots Casino',\n",
       " 'Wink Casino Bingo',\n",
       " 'Winmasters Casino',\n",
       " 'WinTingo Casino',\n",
       " 'Winzino Casino',\n",
       " 'Kasino Verajohn',\n",
       " 'Casino VegasPlay',\n",
       " 'VegasSpins Casino',\n",
       " 'Casino Videolots',\n",
       " 'Kasino Viking Slots',\n",
       " 'VIPRoom Casino',\n",
       " 'Kasino Volt',\n",
       " 'Yako Casino',\n",
       " 'Casino Zone',\n",
       " 'Soartan Casino',\n",
       " 'Box24 Casino',\n",
       " 'Las Vegas USA Casino',\n",
       " 'Sun Palace Casino',\n",
       " '10Bet Casino',\n",
       " '188Bet Casino',\n",
       " '18Bet Casino',\n",
       " 'Kasino 1Bet2Bet',\n",
       " 'Kasino 1xBet',\n",
       " 'Kasino 1xBit',\n",
       " '21 Dukes Casino',\n",
       " 'Grand Casino 21',\n",
       " 'Kasino Prive 21',\n",
       " '21Nova Casino',\n",
       " 'Kasino 24 Pokies',\n",
       " '32Red Casino',\n",
       " '377Bet Casino',\n",
       " '50 Bintang Casino',\n",
       " '7 Gelendong Casino',\n",
       " '7 Sultan Casino',\n",
       " 'Kasino 77 Jackpot',\n",
       " 'Kasino 7Bit',\n",
       " '7Spins Casino',\n",
       " 'AC Casino',\n",
       " 'Ace Kingdom Casino',\n",
       " 'Ace Lucky Casino',\n",
       " 'Adamas Casino',\n",
       " 'Kasino Istana Afrika',\n",
       " 'AfriCasino',\n",
       " 'Aladdins Gold Casino',\n",
       " 'Semua Casino British',\n",
       " 'Semua Casino Ireland',\n",
       " 'Semua Jackpots Casino',\n",
       " 'Semua Casino Slot',\n",
       " 'All You Bet Casino',\n",
       " 'Sentiasa Vegas Casino',\n",
       " 'Kasino ares',\n",
       " 'Astralbet Casino',\n",
       " 'Kelab Kasino Atlantik',\n",
       " 'AuSlots Casino',\n",
       " 'Aunty Acid Casino',\n",
       " 'Azartmania Casino',\n",
       " 'Aztec Kekayaan Casino',\n",
       " 'Kasino Barbados',\n",
       " 'Bell Fruit Casino',\n",
       " 'Bella Vegas Casino',\n",
       " 'Casino Best',\n",
       " 'Bet otto Home Casino',\n",
       " 'Bet365 Casino',\n",
       " 'BetBright Casino',\n",
       " 'BetChain Casino',\n",
       " 'BetDNA Casino',\n",
       " 'BetHard Casino',\n",
       " 'BetJoy Casino',\n",
       " 'BetNCatch Casino',\n",
       " 'BetNSpin Casino',\n",
       " 'BetPhoenix Casino',\n",
       " 'Casino BetRally',\n",
       " 'BetUK Casino',\n",
       " 'BetVictor Casino',\n",
       " 'Betadonis Casino',\n",
       " 'Betchan Casino',\n",
       " 'Betclic Casino',\n",
       " 'Betfair Casino',\n",
       " 'Betfinal Casino',\n",
       " 'Betfred Casino',\n",
       " 'Casino Betive',\n",
       " 'Kasino Betser',\n",
       " 'Kasino Betspin',\n",
       " 'Betway Casino',\n",
       " 'Casino Dollar Besar',\n",
       " 'BIGBANG Casino',\n",
       " 'BitCasino.io',\n",
       " 'Bitstarz Casino',\n",
       " 'Casino Black Diamond',\n",
       " 'Casino Black Lotus',\n",
       " 'Ballroom Blackjack',\n",
       " 'Kasino Blue Lions',\n",
       " 'Kasino BoVegas',\n",
       " 'BoDog Kasino',\n",
       " 'Bogart Casino',\n",
       " 'Bonanza Game Casino',\n",
       " 'Kasino Boombet',\n",
       " 'Bovada Casino',\n",
       " 'BoyleSports Casino',\n",
       " 'Buck dan Butler Casino',\n",
       " 'Kasino Buran',\n",
       " 'Casino Burnbet',\n",
       " 'Buzz Slots Casino',\n",
       " 'BuzzLuck Casino',\n",
       " 'Bwin Casino',\n",
       " 'Bwin.it Casino',\n",
       " 'Cabaret Club Casino',\n",
       " 'Caesars Casino',\n",
       " 'Cafe Casino',\n",
       " 'Calvin Casino',\n",
       " 'Kapten Tukang masak Casino',\n",
       " 'Captain Jack Casino',\n",
       " 'Carotto Casino',\n",
       " 'Karnival Casino',\n",
       " 'Kasino Casdep',\n",
       " 'Kasino o Cash Lot',\n",
       " 'Cashino Casino',\n",
       " 'Kasino Cashpoint',\n",
       " 'Kasino Cashpot',\n",
       " 'Kasino Casibon',\n",
       " 'Casilando Casino',\n",
       " 'Kasino Casillion',\n",
       " 'Kasino Casinia',\n",
       " 'Kasino 21 Bet',\n",
       " 'Casino Tindakan',\n",
       " 'Casino Adrenaline',\n",
       " 'Casino Blu',\n",
       " 'Casino Bordeaux',\n",
       " 'Casino Brango',\n",
       " 'Casino British',\n",
       " 'Casino Calzone',\n",
       " 'Kasino Cash Palace',\n",
       " 'Casino Cerise',\n",
       " 'Casino klasik',\n",
       " 'Casino Club',\n",
       " 'Casino Del Rio',\n",
       " 'Casino Dukes',\n",
       " 'Casino Epoca',\n",
       " 'Casino Estrella',\n",
       " 'Casino Euro',\n",
       " 'Casino tetamu tambahan',\n",
       " 'Casino Extreme',\n",
       " 'Casino Fiz',\n",
       " 'Tingkat Casino',\n",
       " 'Casino Gates',\n",
       " 'Casino Grand Bay',\n",
       " 'Casino Hermes',\n",
       " 'Heroes Casino',\n",
       " 'Casino JEFE',\n",
       " 'Raja Kasino',\n",
       " 'Casino Kingdom',\n",
       " 'Casino La Riviera',\n",
       " 'Casino La Vida',\n",
       " 'Casino Las Vegas',\n",
       " 'Kasino loco',\n",
       " 'Casino Luck',\n",
       " 'Casino Magix',\n",
       " 'Mate Casino',\n",
       " 'Casino Midas',\n",
       " 'Bulan-bulan Casino',\n",
       " 'Casino Napoli',\n",
       " 'Casino Noir',\n",
       " 'Kasino Plex',\n",
       " 'RedKings Casino',\n",
       " 'Bilik Casino',\n",
       " 'Casino Royal Club',\n",
       " 'Casino Kongsi',\n",
       " 'Casino Sieger',\n",
       " 'Casino Splendido',\n",
       " 'Superlines kasino',\n",
       " 'Kasino Supreme Play',\n",
       " 'Casino Triomphe',\n",
       " 'Casino Tropez',\n",
       " 'Casino Venetian',\n",
       " 'Casino Ventura',\n",
       " 'Casino X',\n",
       " 'Casino of Mimpi',\n",
       " 'Casino.com',\n",
       " 'Casino.dk',\n",
       " 'Casino1 Club',\n",
       " 'CasinoCasino',\n",
       " 'CasinoMax',\n",
       " 'CasinoPop',\n",
       " 'CasinoSjov',\n",
       " 'CasinoStugan',\n",
       " 'CasinoVal',\n",
       " 'CasinoWin',\n",
       " 'Casinovo',\n",
       " 'Celtic Casino',\n",
       " 'Casino Challenge',\n",
       " 'Kasino Chance Hill',\n",
       " 'Chanz Casino',\n",
       " 'Cherry Gold Casino',\n",
       " 'Chomp Casino',\n",
       " 'Casino Circus.be',\n",
       " 'Casino awan',\n",
       " 'Kelab Dice Casino',\n",
       " 'Club Casino Gold',\n",
       " 'Kelab Pemain Casino',\n",
       " 'Club Sa Casino',\n",
       " 'Kelab Dunia Casino',\n",
       " 'Cocoa Casino',\n",
       " 'Codeta Casino',\n",
       " 'Kasino Falls Falls',\n",
       " 'Casino Palace Coin',\n",
       " 'Kasino Coliseumbet',\n",
       " 'Colosseum Casino',\n",
       " 'ComeOn Casino',\n",
       " 'SyarikatCasino',\n",
       " 'Conquer Casino',\n",
       " 'Kasino Cotto Percuma',\n",
       " 'Casino Main Percuma',\n",
       " 'Coral Casino',\n",
       " 'Casino Crazy',\n",
       " 'Crazy Luck Casino',\n",
       " 'Casino Crazy Spins',\n",
       " 'Crazy Vegas Casino',\n",
       " 'Casino Pemenang Gila',\n",
       " 'Casino Cristal Palace',\n",
       " 'Crystal Casino',\n",
       " 'Cyber \\u200b\\u200bClub Casino',\n",
       " 'Kasino Dafabet',\n",
       " 'Das Ist Casino',\n",
       " 'Kasino Dash',\n",
       " 'Davincis Casino Gold',\n",
       " 'Kasino Dazzle',\n",
       " 'Dendera Casino',\n",
       " 'Desert Nights Casino',\n",
       " 'Diamond Reels Casino',\n",
       " 'Diamond World Casino',\n",
       " 'Kasino DomGame',\n",
       " 'Dr Vegas Casino',\n",
       " 'Drake Casino',\n",
       " 'Dream Jackpot Casino',\n",
       " 'Dream Palace Casino',\n",
       " 'Dreams Casino',\n",
       " 'Dreamy Seven Casino',\n",
       " 'Casino DublinBet',\n",
       " 'EUcasino',\n",
       " 'Eotto Sleep Bet Casino',\n",
       " 'Eclipse Casino',\n",
       " 'Kasino Bergerak Elite',\n",
       " 'Emu Casino',\n",
       " 'Enzo Casino',\n",
       " 'Euro King Casino',\n",
       " 'Euro Palace Casino',\n",
       " 'Kasino EuroFortune',\n",
       " 'EuroGrand Casino',\n",
       " 'Kasino Eurobet',\n",
       " 'Euromoon Casino',\n",
       " 'Europa Casino',\n",
       " 'Everest Casino',\n",
       " 'Kasino Eksklusif Bet',\n",
       " 'Casino eksklusif',\n",
       " 'Expekt.com secara Casino',\n",
       " 'Kasino Fable',\n",
       " 'Casino Go Fair',\n",
       " 'Fairway Casino',\n",
       " 'Faustbet Casino',\n",
       " 'Fenix \\u200b\\u200bCasino',\n",
       " 'Kasino Fika',\n",
       " 'Fly Casino',\n",
       " 'Kasino FolkeAutomaten',\n",
       " 'Fone Casino',\n",
       " 'Fortune Frenzy Casino',\n",
       " 'Foxy Casino',\n",
       " 'Frank Casino',\n",
       " 'Freaky Aces Casino',\n",
       " 'Kasino Freaky Vegas',\n",
       " 'Percuma Spin Casino',\n",
       " 'Percuma Spins Casino',\n",
       " 'Fruity Casa Casino',\n",
       " 'Kasino Fruity King',\n",
       " 'Fun88 Casino',\n",
       " 'Futuriti Casino',\n",
       " 'Gala Casino',\n",
       " 'Casino Galaxy Pig',\n",
       " 'Kasino GaleMartin',\n",
       " 'Permainan Kelab Kasino',\n",
       " 'Gday Casino',\n",
       " 'Genting Casino',\n",
       " 'Dapatkan Casino Lucky',\n",
       " 'Gibson Casino',\n",
       " 'Kasino Gioco Digitale',\n",
       " 'Kasino Glimmer',\n",
       " 'Pergi Casino Wild',\n",
       " 'Casino GoWin',\n",
       " 'Casino Spins Emas',\n",
       " 'GoldBet Casino',\n",
       " 'Golden Euro Casino',\n",
       " 'Golden Lady Casino',\n",
       " 'Casino Golden Lion',\n",
       " 'Golden Lounge Casino',\n",
       " 'Golden Palace Casino',\n",
       " 'Golden Reef Casino',\n",
       " 'Golden Riviera Casino',\n",
       " 'Star Casino Golden',\n",
       " 'Golden Tiger Casino',\n",
       " 'Goldrun Casino',\n",
       " 'Selamat Hari 4 Play Casino',\n",
       " 'Gossip Slot Casino',\n",
       " 'Grand Eagle Casino',\n",
       " 'Grand Fortune Casino',\n",
       " 'Casino Grand Casino',\n",
       " 'Grand Hotel Casino',\n",
       " 'Grand Ivy Casino',\n",
       " 'Grand Mondial Casino',\n",
       " 'Grand Reef Casino',\n",
       " 'Grand Casino Wild',\n",
       " 'Grande Vegas Casino',\n",
       " 'Kasino Dog Hijau',\n",
       " 'Grosvenor Casino',\n",
       " 'keberanian Casino',\n",
       " 'HRwager Casino',\n",
       " 'Casino High Noon',\n",
       " 'Hippodrome Casino',\n",
       " 'Hippozino Casino',\n",
       " 'Rumah Jack Casino',\n",
       " 'Casino Slots yang besar',\n",
       " 'IGame Casino',\n",
       " 'INetBet Casino',\n",
       " 'Kasino Pencucuhan',\n",
       " 'Ikibu Casino',\n",
       " 'Imperial Casino',\n",
       " 'Kasino Indio',\n",
       " 'Indogvind Casino',\n",
       " 'InterCasino',\n",
       " 'Intertops Casino',\n",
       " 'Interwetten Casino',\n",
       " 'Intrabahis Casino',\n",
       " 'Luck Casino Ireland',\n",
       " 'Island Jackpots Casino',\n",
       " 'jackpot Capital',\n",
       " 'Jackpot Casino Tunai',\n",
       " 'Jackpot City Casino',\n",
       " 'Jackpot Casino Knights',\n",
       " 'Casino Jackpot Luck',\n",
       " 'Casino Jackpot Mobile',\n",
       " 'Jackpot Casino Paradise',\n",
       " 'Kasino Jackpot Wheel',\n",
       " 'Jackpot247 Casino',\n",
       " 'Casino Jackpotjoy',\n",
       " 'Jackpotstrike Casino',\n",
       " 'JetBull Casino',\n",
       " 'Kasino Jojobet',\n",
       " 'Joker Casino',\n",
       " 'Joo Casino',\n",
       " 'Joreels Casino',\n",
       " 'JoyCasino',\n",
       " 'Casino Juicy Stakes',\n",
       " 'Jumba Bet Casino',\n",
       " 'Musytari Club Casino',\n",
       " 'Kaiser Slots Casino',\n",
       " 'Kajot Casino',\n",
       " 'Kerching Casino',\n",
       " 'Billy Casino King',\n",
       " 'KingsWin Casino',\n",
       " 'Kasino konung',\n",
       " 'Casino Kudos',\n",
       " 'Kasino LSbet',\n",
       " 'LaRomere Casino',\n",
       " 'LadyLucks Casino',\n",
       " 'Lake Palace Casino',\n",
       " 'Kasino Lapalingo',\n",
       " 'Larry Casino',\n",
       " 'Leo Vegas Casino',\n",
       " 'Liberty Slot Casino',\n",
       " 'LimoPlay Casino',\n",
       " 'Lincoln Casino',\n",
       " 'Lion Slots Casino',\n",
       " 'Kasino Llama Gaming',\n",
       " 'Loki Casino',\n",
       " 'Lord Lucky Casino',\n",
       " 'Lotus Asia Casino',\n",
       " 'LuckLand Casino',\n",
       " 'Kasino Lucks',\n",
       " 'Lucky Bets Casino',\n",
       " 'Casino Club bertuah',\n",
       " 'Lucky Creek Casino',\n",
       " 'Maharaja Casino Lucky',\n",
       " 'Lucky Nugget Casino',\n",
       " 'Casino Lucky Merah',\n",
       " 'Lucky247 Casino',\n",
       " 'LuckyStar Casino',\n",
       " 'Casino Mewah',\n",
       " 'Madame Chance Casino',\n",
       " 'Kasino Magic Box',\n",
       " 'Magic Casino Merah',\n",
       " 'Magic Star Live Casino',\n",
       " 'Casino Magical Vegas',\n",
       " 'Kasino Magik',\n",
       " 'Majestic Slots Casino',\n",
       " 'Malibu Club Casino',\n",
       " 'Kasino Malina',\n",
       " 'Kasino Mandarin',\n",
       " 'Mansion Casino',\n",
       " 'Maple Casino',\n",
       " 'Casino Matchbook',\n",
       " 'Kasino Maxbet',\n",
       " 'Maxiplay Casino',\n",
       " 'Mayan Fortune Casino',\n",
       " 'Mega Casino DK',\n",
       " 'Megawins Casino',\n",
       " 'Kasino Merkur Win',\n",
       " 'Miami Club Casino',\n",
       " 'Kasino Miami Dice',\n",
       " 'Kasino Jutawan',\n",
       " 'Mission2Game Casino',\n",
       " 'Kasino Mobilautomaten',\n",
       " 'Casino MobileWins',\n",
       " 'Kasino Mobilebet',\n",
       " 'Mobireels Casino',\n",
       " 'Kasino Mobizino',\n",
       " 'MoboCasino',\n",
       " 'Mohegan Sun Casino',\n",
       " 'Kasino MoneyStorm',\n",
       " 'Kasino Monte Carlo',\n",
       " 'Moon Permainan Casino',\n",
       " 'Encik Mega Casino',\n",
       " 'Encik Mobi Casino',\n",
       " 'Encik Ringo Casino',\n",
       " 'Slot Casino',\n",
       " 'Encik Smith Casino',\n",
       " 'Encik Star Casino',\n",
       " 'Encik SuperPlay Casino',\n",
       " 'MrOyun Casino',\n",
       " 'Kasino Mucho Vegas',\n",
       " 'Mummys Casino Gold',\n",
       " 'Kasino Dewan Muzik',\n",
       " 'Kasino Saya',\n",
       " 'MyBet Casino',\n",
       " 'MyWin24 Casino',\n",
       " 'Kasino Permainan Gaming',\n",
       " 'NederBet Casino',\n",
       " 'Nedplay Casino',\n",
       " 'NetBet Casino',\n",
       " 'Tiada Casino Bonus',\n",
       " 'Casino Mulia',\n",
       " 'NogaBet Casino',\n",
       " 'Nordic Slot Casino',\n",
       " 'NordicBet Casino',\n",
       " 'NorgesCasino',\n",
       " 'NorgesSpill Casino',\n",
       " 'Kasino Lampu Utara',\n",
       " 'nostalgia Casino',\n",
       " 'Novibet Casino',\n",
       " 'NoxWin Casino',\n",
       " 'OG Palace Casino',\n",
       " 'Kasino OVO',\n",
       " 'Casino OceanBets',\n",
       " 'Havana Casino Lama',\n",
       " 'Olybet Casino',\n",
       " 'Omni Casino',\n",
       " 'Omni Slots Casino',\n",
       " 'OrientXpress Casino',\n",
       " 'Oshi Casino',\n",
       " 'Osiris Casino',\n",
       " 'Padi Kuasa Casino',\n",
       " 'Manjakan Casino',\n",
       " 'Pantasia Casino',\n",
       " 'Panther Casino',\n",
       " 'Syurga 8 Casino',\n",
       " 'Paradise Win Casino',\n",
       " 'Kasino Parasino',\n",
       " 'Paris VIP Casino',\n",
       " 'Park Lane Casino',\n",
       " 'Casino Parti',\n",
       " 'Casino Phoenicia',\n",
       " 'Kasino Pink',\n",
       " 'Planet 7 Casino',\n",
       " 'planet Casino',\n",
       " 'Planet Kings Casino',\n",
       " 'Platin Casino',\n",
       " 'Platinum Casino',\n",
       " 'Platinum Play Casino',\n",
       " 'Platinum Gelendong Casino',\n",
       " 'Bermain Casino Games',\n",
       " 'Bermain Cosmo Casino',\n",
       " 'Mainkan Fortuna Casino',\n",
       " 'Main Frank Casino',\n",
       " 'Play2Win Casino',\n",
       " 'Play7777 Casino',\n",
       " 'PlayClub Casino',\n",
       " 'Pemain Palace Casino',\n",
       " 'Casino Playgrand',\n",
       " 'Playhippo Casino',\n",
       " 'Banyak Jackpots',\n",
       " 'Kasino mewah',\n",
       " 'Pocket Casino',\n",
       " 'Pocket Casino EU',\n",
       " 'Poket Fruity Casino',\n",
       " 'PocketWin Casino',\n",
       " 'Pots of Luck Casino',\n",
       " 'Power Slots Casino',\n",
       " 'Casino Prism',\n",
       " 'Kasino PropaWin',\n",
       " 'Prospect Hall Casino',\n",
       " 'Casino Punch Bets',\n",
       " 'Quackpot Casino',\n",
       " 'Casino Quasar Gaming',\n",
       " 'Quatro Casino',\n",
       " 'Queen Vegas Casino',\n",
       " 'Raging Bull Casino',\n",
       " 'Ramses Gold Casino',\n",
       " 'Real Deal Bet Casino',\n",
       " 'Red Flush Casino',\n",
       " 'Red Casino Queen',\n",
       " 'Stag Red Casino',\n",
       " 'Red Star Casino',\n",
       " 'Reef Club Casino',\n",
       " 'Reel Island Casino',\n",
       " 'Gelendong Casino Spin',\n",
       " 'Reel Vegas Casino',\n",
       " 'Rembrandt Casino',\n",
       " 'Resorts Casino',\n",
       " 'Kasino Ricardos',\n",
       " 'Casino Kaya',\n",
       " 'Kaya Gelendong Casino',\n",
       " 'Kasino Riverbelle',\n",
       " 'Kasino Rivieraplay',\n",
       " 'Rockbet Casino',\n",
       " 'Roxy Palace Casino',\n",
       " 'Roy Richie Casino',\n",
       " 'RoyaalCasino',\n",
       " 'Royal Ace Casino',\n",
       " 'Royal Panda Casino',\n",
       " 'Royal Planet Casino',\n",
       " 'Casino Swipe Diraja',\n",
       " 'Royal Vegas Casino',\n",
       " 'Casino Jackpot Royale',\n",
       " 'Ruby Slot Casino',\n",
       " 'SCasino',\n",
       " 'Kasino Sahara Sands',\n",
       " 'Kasino Bilik Sapphire',\n",
       " 'Schmitts Casino',\n",
       " 'Kasino Slot Rahsia',\n",
       " 'Shadow Bet Casino',\n",
       " 'Silver Oak Casino',\n",
       " 'Kasino Silveredge',\n",
       " 'Silversands Casino',\n",
       " 'Simba Permainan Casino',\n",
       " 'Kasino Sin Spins',\n",
       " 'Sky Casino',\n",
       " 'Kasino Sky Vegas',\n",
       " 'Slingo Casino',\n",
       " 'Slot Boss Casino',\n",
       " 'Slot Fruity Casino',\n",
       " 'Slot Matic Casino',\n",
       " 'Slot Casino Kacang',\n",
       " 'Slot Planet Casino',\n",
       " 'Slot Casino Kuasa',\n",
       " 'SlotV Casino',\n",
       " 'Slotastic Casino',\n",
       " 'Slotland Casino',\n",
       " 'SlotoHit Casino',\n",
       " 'Slots Angel Casino',\n",
       " 'Casino Slots Capital',\n",
       " 'Slot Devil Casino',\n",
       " 'Slot Casino Syurga',\n",
       " 'Slot Casino Magic',\n",
       " 'Slot Casino Kampung',\n",
       " 'Slots.lv Casino',\n",
       " 'SlotsMillion Casino',\n",
       " 'SlotsMobile Casino',\n",
       " 'SlotsZoo Casino',\n",
       " 'Slotsmoon Casino',\n",
       " 'Slotter Casino',\n",
       " 'Slotty Vegas Casino',\n",
       " 'Casino Sloty',\n",
       " 'Kasino Bergerak Pintar',\n",
       " 'Kasino Smashing',\n",
       " 'Space Casino',\n",
       " 'SpaceLilly Casino',\n",
       " 'Spartan Slot Casino',\n",
       " 'Spin Fiesta Casino',\n",
       " 'Spin Genie Casino',\n",
       " 'Spin Palace Casino',\n",
       " 'Kasino Spin Princess',\n",
       " 'Spin Station Casino',\n",
       " 'Spin dan Menang Casino',\n",
       " 'Spinit Casino',\n",
       " 'Casino Spinland',\n",
       " 'Casino Spinprive',\n",
       " 'Casino Spinsvilla',\n",
       " 'Spinzilla Casino',\n",
       " 'Sportingbet Casino',\n",
       " 'Sukan Interaksi Casino',\n",
       " 'Springbok Casino',\n",
       " 'Stake7 Casino',\n",
       " 'Kasino Kasino',\n",
       " 'Stan James Kasino',\n",
       " 'Casino StarGames',\n",
       " 'Casino Starspins',\n",
       " 'Strike It Lucky Casino',\n",
       " 'Kasino Summit',\n",
       " 'Sunmaker Casino',\n",
       " 'Casino SunnyPlayer',\n",
       " 'SuomiKasino',\n",
       " 'SuperCasino',\n",
       " 'Casino Superior',\n",
       " 'supernova Casino',\n",
       " 'SveaCasino',\n",
       " 'Kasino SverigeKronan',\n",
       " 'SverigeCasino',\n",
       " 'SwedenCasino',\n",
       " 'TTR Casino',\n",
       " 'The Casino Maya',\n",
       " 'Thebes Casino',\n",
       " 'Kasino Thunderbolt',\n",
       " 'Times Square Casino',\n",
       " 'Titan Casino',\n",
       " 'Titanbet IT Casino',\n",
       " 'TopBet Casino',\n",
       " 'Jumlah Emas Kasino',\n",
       " 'ToteSport Casino',\n",
       " 'Sentuh Lucky Casino',\n",
       " 'Sentuh Casino Mudah Alih',\n",
       " 'Trada Casino',\n",
       " 'Tradisi Casino',\n",
       " 'Treasure Mile Casino',\n",
       " 'Tropezia Palace Casino',\n",
       " ...]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the gambling-class samples loaded from nsfw.json.\n",
    "data['gambling']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Seed the global RNG so the negative subset (and every later `random` call\n",
    "# in this notebook) is reproducible after Restart & Run All.\n",
    "SEED = 42\n",
    "# Number of negative lines to keep out of the full common-crawl dump.\n",
    "NEGATIVE_SAMPLE_SIZE = 6000000\n",
    "\n",
    "random.seed(SEED)\n",
    "# Cap the request at the corpus size: random.sample raises ValueError when\n",
    "# asked for more items than the population holds.\n",
    "negative_sample = random.sample(negative, min(NEGATIVE_SAMPLE_SIZE, len(negative)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class groups in label-id order: 0 = sex, 1 = gambling, 2 = negative\n",
    "# (the same order as the `labels` list defined earlier).\n",
    "class_texts = [data['sex'], data['gambling'], negative_sample]\n",
    "# Flatten the groups into one list of texts ...\n",
    "texts = [text for group in class_texts for text in group]\n",
    "# ... and emit one class id per text, aligned index-by-index with `texts`.\n",
    "Y = [class_id for class_id, group in enumerate(class_texts) for _ in group]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token replacement table, looked up in `cleaning` as `rules_normalizer.get(w, w)`\n",
    "# for every whitespace-separated token, before the token is lower-cased.\n",
    "# First part: shorthand, slang and English words -> a normalized (mostly Malay) form.\n",
    "# Second part (from ':*' onwards): emoticons -> placeholder tags such as '<happy>'.\n",
    "rules_normalizer = {\n",
    "    'sapa': 'siapa',\n",
    "    'tq': 'terima kasih',\n",
    "    'ty': 'terima kasih',\n",
    "    'january': 'januari',\n",
    "    'february': 'februari',\n",
    "    'march': 'mac',\n",
    "    'may': 'mei',\n",
    "    'june': 'jun',\n",
    "    'july': 'julai',\n",
    "    'august': 'ogos',\n",
    "    'october': 'oktober',\n",
    "    'december': 'disember',\n",
    "    'dec': 'dis',\n",
    "    'oct': 'okt',\n",
    "    'monday': 'isnin',\n",
    "    'mon': 'isn',\n",
    "    'tuesday': 'selasa',\n",
    "    'tues': 'sel',\n",
    "    'wednesday': 'rabu',\n",
    "    'wed': 'rab',\n",
    "    'thursday': 'khamis',\n",
    "    'thurs': 'kha',\n",
    "    'friday': 'jumaat',\n",
    "    'fri': 'jum',\n",
    "    'saturday': 'sabtu',\n",
    "    'sat': 'sab',\n",
    "    'sunday': 'ahad',\n",
    "    'sun': 'ahd',\n",
    "    'gemen': 'kerajaan',\n",
    "    'camtu': 'macam itu',\n",
    "    'experience': 'pengalaman',\n",
    "    'kpd': 'kepada',\n",
    "    'bengng': 'bengang',\n",
    "    'mntak': 'minta',\n",
    "    'bagasi': 'bagasi',\n",
    "    'kg': 'kampung',\n",
    "    'kilo': 'kilogram',\n",
    "    'g': 'pergi',\n",
    "    'grm': 'gram',\n",
    "    'k': 'okay',\n",
    "    'abgkat': 'abang dekat',\n",
    "    'abis': 'habis',\n",
    "    'ade': 'ada',\n",
    "    'adoi': 'aduh',\n",
    "    'adoii': 'aduh',\n",
    "    'aerodarat': 'kapal darat',\n",
    "    'agkt': 'angkat',\n",
    "    'ahh': 'ah',\n",
    "    'ailior': 'air liur',\n",
    "    'airasia': 'air asia x',\n",
    "    'airasiax': 'penerbangan',\n",
    "    'airline': 'penerbangan',\n",
    "    'airlines': 'penerbangan',\n",
    "    'airport': 'lapangan terbang',\n",
    "    'airpot': 'lapangan terbang',\n",
    "    'aje': 'sahaja',\n",
    "    'ajelah': 'sahajalah',\n",
    "    'ajer': 'sahaja',\n",
    "    'ak': 'aku',\n",
    "    'aq': 'aku',\n",
    "    'all': 'semua',\n",
    "    'ambik': 'ambil',\n",
    "    'amek': 'ambil',\n",
    "    'amer': 'amir',\n",
    "    'amik': 'ambil',\n",
    "    'ana': 'saya',\n",
    "    'angkt': 'angkat',\n",
    "    'anual': 'tahunan',\n",
    "    'apapun': 'apa pun',\n",
    "    'ape': 'apa',\n",
    "    'arab': 'arab',\n",
    "    'area': 'kawasan',\n",
    "    'aritu': 'hari itu',\n",
    "    'ask': 'tanya',\n",
    "    'astro': 'astro',\n",
    "    'at': 'pada',\n",
    "    'attitude': 'sikap',\n",
    "    'babi': 'khinzir',\n",
    "    'back': 'belakang',\n",
    "    'bag': 'beg',\n",
    "    'bang': 'abang',\n",
    "    'bangla': 'bangladesh',\n",
    "    'banyk': 'banyak',\n",
    "    'bard': 'pujangga',\n",
    "    'bargasi': 'bagasi',\n",
    "    'bawak': 'bawa',\n",
    "    'bawanges': 'bawang',\n",
    "    'be': 'jadi',\n",
    "    'behave': 'berkelakuan baik',\n",
    "    'belagak': 'berlagak',\n",
    "    'berdisiplin': 'berdisplin',\n",
    "    'berenti': 'berhenti',\n",
    "    'beskal': 'basikal',\n",
    "    'bff': 'rakan karib',\n",
    "    'bg': 'bagi',\n",
    "    'bgi': 'bagi',\n",
    "    'biase': 'biasa',\n",
    "    'big': 'besar',\n",
    "    'bike': 'basikal',\n",
    "    'bile': 'bila',\n",
    "    'binawe': 'binatang',\n",
    "    'bini': 'isteri',\n",
    "    'bkn': 'bukan',\n",
    "    'bla': 'bila',\n",
    "    'blom': 'belum',\n",
    "    'bnyak': 'banyak',\n",
    "    'body': 'tubuh',\n",
    "    'bole': 'boleh',\n",
    "    'boss': 'bos',\n",
    "    'bowling': 'boling',\n",
    "    'bpe': 'berapa',\n",
    "    'brand': 'jenama',\n",
    "    'brg': 'barang',\n",
    "    'briefing': 'taklimat',\n",
    "    'brng': 'barang',\n",
    "    'bro': 'abang',\n",
    "    'bru': 'baru',\n",
    "    'bruntung': 'beruntung',\n",
    "    'bsikal': 'basikal',\n",
    "    'btnggjwb': 'bertanggungjawab',\n",
    "    'btul': 'betul',\n",
    "    'buatlh': 'buatlah',\n",
    "    'buh': 'letak',\n",
    "    'buka': 'buka',\n",
    "    'but': 'tetapi',\n",
    "    'bwk': 'bawa',\n",
    "    'by': 'dengan',\n",
    "    'byr': 'bayar',\n",
    "    'bz': 'sibuk',\n",
    "    'camera': 'kamera',\n",
    "    'camni': 'macam ini',\n",
    "    'cane': 'macam mana',\n",
    "    'cant': 'tak boleh',\n",
    "    'carakerja': 'cara kerja',\n",
    "    'care': 'jaga',\n",
    "    'cargo': 'kargo',\n",
    "    'cctv': 'kamera litar tertutup',\n",
    "    'celako': 'celaka',\n",
    "    'cer': 'cerita',\n",
    "    'cheap': 'murah',\n",
    "    'check': 'semak',\n",
    "    'ciput': 'sedikit',\n",
    "    'cite': 'cerita',\n",
    "    'citer': 'cerita',\n",
    "    'ckit': 'sikit',\n",
    "    'cikit': 'sikit',\n",
    "    'ckp': 'cakap',\n",
    "    'class': 'kelas',\n",
    "    'cm': 'macam',\n",
    "    'cmni': 'macam ini',\n",
    "    'cmpak': 'campak',\n",
    "    'committed': 'komited',\n",
    "    'company': 'syarikat',\n",
    "    'complain': 'aduan',\n",
    "    'corn': 'jagung',\n",
    "    'couldnt': 'tak boleh',\n",
    "    'cr': 'cari',\n",
    "    'crew': 'krew',\n",
    "    'cube': 'cuba',\n",
    "    'cuma': 'cuma',\n",
    "    'curinyaa': 'curinya',\n",
    "    'cust': 'pelanggan',\n",
    "    'customer': 'pelanggan',\n",
    "    'd': 'di',\n",
    "    'da': 'dah',\n",
    "    'dn': 'dan',\n",
    "    'dahh': 'dah',\n",
    "    'damaged': 'rosak',\n",
    "    'dapek': 'dapat',\n",
    "    'day': 'hari',\n",
    "    'dazrin': 'dazrin',\n",
    "    'dbalingnya': 'dibalingnya',\n",
    "    'de': 'ada',\n",
    "    'deep': 'dalam',\n",
    "    'deliberately': 'sengaja',\n",
    "    'depa': 'mereka',\n",
    "    'dessa': 'desa',\n",
    "    'dgn': 'dengan',\n",
    "    'dh': 'dah',\n",
    "    'didunia': 'di dunia',\n",
    "    'diorang': 'mereka',\n",
    "    'diorng': 'mereka',\n",
    "    'direct': 'secara terus',\n",
    "    'diving': 'junam',\n",
    "    'dkt': 'dekat',\n",
    "    'dlempar': 'dilempar',\n",
    "    'dlm': 'dalam',\n",
    "    'dlt': 'padam',\n",
    "    'dlu': 'dulu',\n",
    "    'done': 'siap',\n",
    "    'dont': 'jangan',\n",
    "    'dorg': 'mereka',\n",
    "    'dpermudhkn': 'dipermudahkan',\n",
    "    'dpt': 'dapat',\n",
    "    'dri': 'dari',\n",
    "    'dsb': 'dan sebagainya',\n",
    "    'dy': 'dia',\n",
    "    'educate': 'mendidik',\n",
    "    'ensure': 'memastikan',\n",
    "    'everything': 'semua',\n",
    "    'ewahh': 'wah',\n",
    "    'expect': 'sangka',\n",
    "    'fb': 'facebook',\n",
    "    'fired': 'pecat',\n",
    "    'first': 'pertama',\n",
    "    'fkr': 'fikir',\n",
    "    'flight': 'kapal terbang',\n",
    "    'for': 'untuk',\n",
    "    'free': 'percuma',\n",
    "    'friend': 'kawan',\n",
    "    'fyi': 'untuk pengetahuan anda',\n",
    "    'gantila': 'gantilah',\n",
    "    'gantirugi': 'ganti rugi',\n",
    "    'gentlemen': 'lelaki budiman',\n",
    "    'gerenti': 'jaminan',\n",
    "    'gile': 'gila',\n",
    "    'gk': 'juga',\n",
    "    'gnti': 'ganti',\n",
    "    'go': 'pergi',\n",
    "    'gomen': 'kerajaan',\n",
    "    'goment': 'kerajaan',\n",
    "    'good': 'baik',\n",
    "    'ground': 'tanah',\n",
    "    'guarno': 'macam mana',\n",
    "    'hampa': 'mereka',\n",
    "    'hampeh': 'teruk',\n",
    "    'hanat': 'jahanam',\n",
    "    'handle': 'kawal',\n",
    "    'handling': 'kawalan',\n",
    "    'hanta': 'hantar',\n",
    "    'haritu': 'hari itu',\n",
    "    'harini': 'hari ini',\n",
    "    'hate': 'benci',\n",
    "    'have': 'ada',\n",
    "    'hawau': 'celaka',\n",
    "    'henpon': 'telefon',\n",
    "    'heran': 'hairan',\n",
    "    'him': 'dia',\n",
    "    'his': 'dia',\n",
    "    'hmpa': 'mereka',\n",
    "    'hntr': 'hantar',\n",
    "    'hotak': 'otak',\n",
    "    'hr': 'hari',\n",
    "    'i': 'saya',\n",
    "    'hrga': 'harga',\n",
    "    'hrp': 'harap',\n",
    "    'hu': 'sedih',\n",
    "    'humble': 'merendah diri',\n",
    "    'ibon': 'ikon',\n",
    "    'ichi': 'inci',\n",
    "    'idung': 'hidung',\n",
    "    'if': 'jika',\n",
    "    'ig': 'instagram',\n",
    "    'iklas': 'ikhlas',\n",
    "    'improve': 'menambah baik',\n",
    "    'in': 'masuk',\n",
    "    'isn t': 'tidak',\n",
    "    'isyaallah': 'insyallah',\n",
    "    'ja': 'sahaja',\n",
    "    'japan': 'jepun',\n",
    "    'jd': 'jadi',\n",
    "    'saja': 'sahaja',\n",
    "    'saje': 'sahaja',\n",
    "    'je': 'sahaja',\n",
    "    'jee': 'sahaja',\n",
    "    'jek': 'sahaja',\n",
    "    'jepun': 'jepun',\n",
    "    'jer': 'sahaja',\n",
    "    'jerr': 'sahaja',\n",
    "    'jez': 'sahaja',\n",
    "    'jg': 'juga',\n",
    "    'jgk': 'juga',\n",
    "    'jgn': 'jangan',\n",
    "    'jgnla': 'janganlah',\n",
    "    'jibake': 'celaka',\n",
    "    'jjur': 'jujur',\n",
    "    'job': 'kerja',\n",
    "    'jobscope': 'skop kerja',\n",
    "    'jogja': 'jogjakarta',\n",
    "    'jpam': 'jpam',\n",
    "    'jth': 'jatuh',\n",
    "    'jugak': 'juga',\n",
    "    'ka': 'ke',\n",
    "    'kalo': 'kalau',\n",
    "    'kalu': 'kalau',\n",
    "    'kang': 'nanti',\n",
    "    'kantoi': 'temberang',\n",
    "    'kasi': 'beri',\n",
    "    'kat': 'dekat',\n",
    "    'kbye': 'ok bye',\n",
    "    'kearah': 'ke arah',\n",
    "    'kecik': 'kecil',\n",
    "    'keja': 'kerja',\n",
    "    'keje': 'kerja',\n",
    "    'kejo': 'kerja',\n",
    "    'keksongan': 'kekosongan',\n",
    "    'kemana': 'ke mana',\n",
    "    'kene': 'kena',\n",
    "    'kenekan': 'kenakan',\n",
    "    'kesah': 'kisah',\n",
    "    'ketempat': 'ke tempat',\n",
    "    'kije': 'kerja',\n",
    "    'kijo': 'kerja',\n",
    "    'kiss': 'cium',\n",
    "    'kite': 'kita',\n",
    "    'kito': 'kita',\n",
    "    'kje': 'kerja',\n",
    "    'kjr': 'kerja',\n",
    "    'kk': 'okay',\n",
    "    'kmi': 'kami',\n",
    "    'kt': 'kat',\n",
    "    'tlg': 'tolong',\n",
    "    'kl': 'kuala lumpur',\n",
    "    'klai': 'kalau',\n",
    "    'klau': 'kalau',\n",
    "    'klia': 'klia',\n",
    "    'klo': 'kalau',\n",
    "    'klu': 'kalau',\n",
    "    'kn': 'kan',\n",
    "    'knapa': 'kenapa',\n",
    "    'kne': 'kena',\n",
    "    'ko': 'kau',\n",
    "    'kompom': 'sah',\n",
    "    'korang': 'kamu semua',\n",
    "    'korea': 'korea',\n",
    "    'korg': 'kamu semua',\n",
    "    'kot': 'mungkin',\n",
    "    'krja': 'kerja',\n",
    "    'ksalahan': 'kesalahan',\n",
    "    'kta': 'kita',\n",
    "    'kuar': 'keluar',\n",
    "    'kut': 'mungkin',\n",
    "    'la': 'lah',\n",
    "    'laa': 'lah',\n",
    "    'lahabau': 'celaka',\n",
    "    'lahanat': 'celaka',\n",
    "    'lainda': 'lain dah',\n",
    "    'lak': 'pula',\n",
    "    'last': 'akhir',\n",
    "    'le': 'lah',\n",
    "    'leader': 'ketua',\n",
    "    'leave': 'pergi',\n",
    "    'ler': 'lah',\n",
    "    'less': 'kurang',\n",
    "    'letter': 'surat',\n",
    "    'lg': 'lagi',\n",
    "    'lgi': 'lagi',\n",
    "    'lngsong': 'langsung',\n",
    "    'lol': 'hehe',\n",
    "    'lorr': 'lah',\n",
    "    'low': 'rendah',\n",
    "    'lps': 'lepas',\n",
    "    'luggage': 'bagasi',\n",
    "    'lumbe': 'lumba',\n",
    "    'lyak': 'layak',\n",
    "    'maap': 'maaf',\n",
    "    'maapkan': 'maafkan',\n",
    "    'mahai': 'mahal',\n",
    "    'mampos': 'mampus',\n",
    "    'mart': 'kedai',\n",
    "    'mau': 'mahu',\n",
    "    'mcm': 'macam',\n",
    "    'mcmtu': 'macam itu',\n",
    "    'memerlukn': 'memerlukan',\n",
    "    'mengembirakan': 'menggembirakan',\n",
    "    'mengmbilnyer': 'mengambilnya',\n",
    "    'mengtasi': 'mengatasi',\n",
    "    'mg': 'memang',\n",
    "    'mihak': 'memihak',\n",
    "    'min': 'admin',\n",
    "    'mingu': 'minggu',\n",
    "    'mintak': 'minta',\n",
    "    'mjtuhkn': 'menjatuhkan',\n",
    "    'mkyong': 'mak yong',\n",
    "    'mlibatkn': 'melibatkan',\n",
    "    'mmg': 'memang',\n",
    "    'mmnjang': 'memanjang',\n",
    "    'mmpos': 'mampus',\n",
    "    'mn': 'mana',\n",
    "    'mna': 'mana',\n",
    "    'mntk': 'minta',\n",
    "    'mnyusun': 'menyusun',\n",
    "    'mood': 'suasana',\n",
    "    'most': 'paling',\n",
    "    'mr': 'tuan',\n",
    "    'msa': 'masa',\n",
    "    'msia': 'malaysia',\n",
    "    'mst': 'mesti',\n",
    "    'mu': 'awak',\n",
    "    'much': 'banyak',\n",
    "    'muko': 'muka',\n",
    "    'mum': 'emak',\n",
    "    'n': 'dan',\n",
    "    'nah': 'nah',\n",
    "    'nanny': 'nenek',\n",
    "    'napo': 'kenapa',\n",
    "    'nati': 'nanti',\n",
    "    'ngan': 'dengan',\n",
    "    'ngn': 'dengan',\n",
    "    'ni': 'ini',\n",
    "    'nie': 'ini',\n",
    "    'nii': 'ini',\n",
    "    'nk': 'nak',\n",
    "    'nmpk': 'nampak',\n",
    "    'nye': 'nya',\n",
    "    'ofis': 'pejabat',\n",
    "    'ohh': 'oh',\n",
    "    'oii': 'hoi',\n",
    "    'one': 'satu',\n",
    "    'online': 'dalam talian',\n",
    "    'or': 'atau',\n",
    "    'org': 'orang',\n",
    "    'orng': 'orang',\n",
    "    'otek': 'otak',\n",
    "    'p': 'pergi',\n",
    "    'paid': 'dah bayar',\n",
    "    'palabana': 'kepala otak',\n",
    "    'pasni': 'lepas ini',\n",
    "    'passengers': 'penumpang',\n",
    "    'passengger': 'penumpang',\n",
    "    'pastu': 'lepas itu',\n",
    "    'pd': 'pada',\n",
    "    'pegi': 'pergi',\n",
    "    'pekerje': 'pekerja',\n",
    "    'pekrja': 'pekerja',\n",
    "    'perabih': 'perabis',\n",
    "    'perkerja': 'pekerja',\n",
    "    'pg': 'pergi',\n",
    "    'phuii': 'puih',\n",
    "    'pikir': 'fikir',\n",
    "    'pilot': 'juruterbang',\n",
    "    'pk': 'fikir',\n",
    "    'pkerja': 'pekerja',\n",
    "    'pkerjaan': 'pekerjaan',\n",
    "    'pki': 'pakai',\n",
    "    'please': 'tolong',\n",
    "    'pls': 'tolong',\n",
    "    'pn': 'pun',\n",
    "    'pnh': 'pernah',\n",
    "    'pnt': 'penat',\n",
    "    'pnya': 'punya',\n",
    "    'pon': 'pun',\n",
    "    'priority': 'keutamaan',\n",
    "    'properties': 'harta benda',\n",
    "    'ptugas': 'petugas',\n",
    "    'pub': 'kelab malam',\n",
    "    'pulak': 'pula',\n",
    "    'puye': 'punya',\n",
    "    'pwrcuma': 'percuma',\n",
    "    'pyahnya': 'payahnya',\n",
    "    'quality': 'kualiti',\n",
    "    'quit': 'keluar',\n",
    "    'ramly': 'ramly',\n",
    "    'rege': 'harga',\n",
    "    'reger': 'harga',\n",
    "    'report': 'laporan',\n",
    "    'resigned': 'meletakkan jawatan',\n",
    "    'respect': 'hormat',\n",
    "    'rizal': 'rizal',\n",
    "    'rosak': 'rosak',\n",
    "    'rosok': 'rosak',\n",
    "    'rse': 'rasa',\n",
    "    'sacked': 'buang',\n",
    "    'sado': 'tegap',\n",
    "    'salute': 'sanjung',\n",
    "    'sam': 'sama',\n",
    "    'same': 'sama',\n",
    "    'samp': 'sampah',\n",
    "    'sbb': 'sebab',\n",
    "    'sbgai': 'sebagai',\n",
    "    'sblm': 'sebelum',\n",
    "    'sblum': 'sebelum',\n",
    "    'sbnarnya': 'sebenarnya',\n",
    "    'sbum': 'sebelum',\n",
    "    'sdg': 'sedang',\n",
    "    'sebb': 'sebab',\n",
    "    'sebijik': 'sebiji',\n",
    "    'see': 'lihat',\n",
    "    'seen': 'dilihat',\n",
    "    'selangor': 'selangor',\n",
    "    'selfie': 'swafoto',\n",
    "    'sempoi': 'cantik',\n",
    "    'senaraihitam': 'senarai hitam',\n",
    "    'seorg': 'seorang',\n",
    "    'service': 'perkhidmatan',\n",
    "    'sgt': 'sangat',\n",
    "    'shared': 'kongsi',\n",
    "    'shirt': 'kemeja',\n",
    "    'shut': 'tutup',\n",
    "    'sib': 'nasib',\n",
    "    'skali': 'sekali',\n",
    "    'sket': 'sikit',\n",
    "    'sma': 'sama',\n",
    "    'smoga': 'semoga',\n",
    "    'smpoi': 'cantik',\n",
    "    'sndiri': 'sendiri',\n",
    "    'sndr': 'sendiri',\n",
    "    'sndri': 'sendiri',\n",
    "    'sne': 'sana',\n",
    "    'so': 'jadi',\n",
    "    'sop': 'tatacara pengendalian piawai',\n",
    "    'sorang': 'seorang',\n",
    "    'spoting': 'pembintikan',\n",
    "    'sronok': 'seronok',\n",
    "    'ssh': 'susah',\n",
    "    'staff': 'staf',\n",
    "    'standing': 'berdiri',\n",
    "    'start': 'mula',\n",
    "    'steady': 'mantap',\n",
    "    'stiap': 'setiap',\n",
    "    'stress': 'stres',\n",
    "    'student': 'pelajar',\n",
    "    'study': 'belajar',\n",
    "    'studycase': 'kajian kes',\n",
    "    'sure': 'pasti',\n",
    "    'sykt': 'syarikat',\n",
    "    'tah': 'entah',\n",
    "    'taik': 'tahi',\n",
    "    'takan': 'tak akan',\n",
    "    'takat': 'setakat',\n",
    "    'takde': 'tak ada',\n",
    "    'takkan': 'tak akan',\n",
    "    'taknak': 'tak nak',\n",
    "    'tang': 'tentang',\n",
    "    'tanggungjawab': 'bertanggungjawab',\n",
    "    'taraa': 'sementara',\n",
    "    'tau': 'tahu',\n",
    "    'tbabit': 'terbabit',\n",
    "    'team': 'pasukan',\n",
    "    'terbaekk': 'terbaik',\n",
    "    'teruknye': 'teruknya',\n",
    "    'tgk': 'tengok',\n",
    "    'that': 'itu',\n",
    "    'thinking': 'fikir',\n",
    "    'those': 'itu',\n",
    "    'time': 'masa',\n",
    "    'tk': 'tak',\n",
    "    'tnggongjwb': 'tanggungjawab',\n",
    "    'tngok': 'tengok',\n",
    "    'tngu': 'tunggu',\n",
    "    'to': 'kepada',\n",
    "    'tosak': 'rosak',\n",
    "    'tp': 'tapi',\n",
    "    'tpi': 'tapi',\n",
    "    'tpon': 'telefon',\n",
    "    'transfer': 'pindah',\n",
    "    'trgelak': 'tergelak',\n",
    "    'ts': 'tan sri',\n",
    "    'tstony': 'tan sri tony',\n",
    "    'tu': 'itu',\n",
    "    'tuh': 'itu',\n",
    "    'tula': 'itulah',\n",
    "    'umeno': 'umno',\n",
    "    'unfortunately': 'malangnya',\n",
    "    'unhappy': 'tidak gembira',\n",
    "    'up': 'naik',\n",
    "    'upkan': 'naikkan',\n",
    "    'ur': 'awak',\n",
    "    'utk': 'untuk',\n",
    "    'very': 'sangat',\n",
    "    'viral': 'tular',\n",
    "    'vote': 'undi',\n",
    "    'warning': 'amaran',\n",
    "    'warranty': 'waranti',\n",
    "    'wassap': 'whatsapp',\n",
    "    'wat': 'apa',\n",
    "    'weii': 'wei',\n",
    "    'well': 'maklumlah',\n",
    "    'win': 'menang',\n",
    "    'with': 'dengan',\n",
    "    'wt': 'buat',\n",
    "    'x': 'tak',\n",
    "    'tw': 'tahu',\n",
    "    'ye': 'ya',\n",
    "    'yee': 'ya',\n",
    "    'yg': 'yang',\n",
    "    'yng': 'yang',\n",
    "    'you': 'awak',\n",
    "    'your': 'awak',\n",
    "    'sakai': 'selekeh',\n",
    "    'rmb': 'billion ringgit',\n",
    "    'rmj': 'juta ringgit',\n",
    "    'rmk': 'ribu ringgit',\n",
    "    'rm': 'ringgit',\n",
    "    # Emoticons -> placeholder tags. Backslashes in keys are written as '\\\\'\n",
    "    # so no key relies on an invalid escape sequence.\n",
    "    ':*': '<kiss>',\n",
    "    ':-*': '<kiss>',\n",
    "    ':x': '<kiss>',\n",
    "    ':-)': '<happy>',\n",
    "    ':-))': '<happy>',\n",
    "    ':-)))': '<happy>',\n",
    "    ':-))))': '<happy>',\n",
    "    ':-)))))': '<happy>',\n",
    "    ':-))))))': '<happy>',\n",
    "    ':)': '<happy>',\n",
    "    ':))': '<happy>',\n",
    "    ':)))': '<happy>',\n",
    "    ':))))': '<happy>',\n",
    "    ':)))))': '<happy>',\n",
    "    ':))))))': '<happy>',\n",
    "    ':)))))))': '<happy>',\n",
    "    ':o)': '<happy>',\n",
    "    ':]': '<happy>',\n",
    "    ':3': '<happy>',\n",
    "    ':c)': '<happy>',\n",
    "    ':>': '<happy>',\n",
    "    '=]': '<happy>',\n",
    "    '8)': '<happy>',\n",
    "    '=)': '<happy>',\n",
    "    ':}': '<happy>',\n",
    "    ':^)': '<happy>',\n",
    "    '|;-)': '<happy>',\n",
    "    \":'-)\": '<happy>',\n",
    "    \":')\": '<happy>',\n",
    "    '\\\\o/': '<happy>',\n",
    "    '*\\\\0/*': '<happy>',\n",
    "    ':-D': '<laugh>',\n",
    "    ':D': '<laugh>',\n",
    "    '8-D': '<laugh>',\n",
    "    '8D': '<laugh>',\n",
    "    'x-D': '<laugh>',\n",
    "    'xD': '<laugh>',\n",
    "    'X-D': '<laugh>',\n",
    "    'XD': '<laugh>',\n",
    "    '=-D': '<laugh>',\n",
    "    '=D': '<laugh>',\n",
    "    '=-3': '<laugh>',\n",
    "    '=3': '<laugh>',\n",
    "    'B^D': '<laugh>',\n",
    "    '>:[': '<sad>',\n",
    "    ':-(': '<sad>',\n",
    "    ':-((': '<sad>',\n",
    "    ':-(((': '<sad>',\n",
    "    ':-((((': '<sad>',\n",
    "    ':-(((((': '<sad>',\n",
    "    ':-((((((': '<sad>',\n",
    "    ':-(((((((': '<sad>',\n",
    "    ':(': '<sad>',\n",
    "    ':((': '<sad>',\n",
    "    ':(((': '<sad>',\n",
    "    ':((((': '<sad>',\n",
    "    ':(((((': '<sad>',\n",
    "    ':((((((': '<sad>',\n",
    "    ':(((((((': '<sad>',\n",
    "    ':((((((((': '<sad>',\n",
    "    ':-c': '<sad>',\n",
    "    ':c': '<sad>',\n",
    "    ':-<': '<sad>',\n",
    "    ':<': '<sad>',\n",
    "    ':-[': '<sad>',\n",
    "    ':[': '<sad>',\n",
    "    ':{': '<sad>',\n",
    "    ':-||': '<sad>',\n",
    "    ':@': '<sad>',\n",
    "    \":'-(\": '<sad>',\n",
    "    \":'(\": '<sad>',\n",
    "    'D:<': '<sad>',\n",
    "    'D:': '<sad>',\n",
    "    'D8': '<sad>',\n",
    "    'D;': '<sad>',\n",
    "    'D=': '<sad>',\n",
    "    'DX': '<sad>',\n",
    "    'v.v': '<sad>',\n",
    "    \"D-':\": '<sad>',\n",
    "    '(>_<)': '<sad>',\n",
    "    ':|': '<sad>',\n",
    "    '>:O': '<surprise>',\n",
    "    ':-O': '<surprise>',\n",
    "    ':-o': '<surprise>',\n",
    "    ':O': '<surprise>',\n",
    "    '°o°': '<surprise>',\n",
    "    'o_O': '<surprise>',\n",
    "    'o_0': '<surprise>',\n",
    "    'o.O': '<surprise>',\n",
    "    'o-o': '<surprise>',\n",
    "    '8-0': '<surprise>',\n",
    "    '|-O': '<surprise>',\n",
    "    ';-)': '<wink>',\n",
    "    ';)': '<wink>',\n",
    "    '*-)': '<wink>',\n",
    "    '*)': '<wink>',\n",
    "    ';-]': '<wink>',\n",
    "    ';]': '<wink>',\n",
    "    ';D': '<wink>',\n",
    "    ';^)': '<wink>',\n",
    "    ':-,': '<wink>',\n",
    "    '>:P': '<tong>',\n",
    "    ':-P': '<tong>',\n",
    "    ':P': '<tong>',\n",
    "    'X-P': '<tong>',\n",
    "    'x-p': '<tong>',\n",
    "    'xp': '<tong>',\n",
    "    'XP': '<tong>',\n",
    "    ':-p': '<tong>',\n",
    "    ':p': '<tong>',\n",
    "    '=p': '<tong>',\n",
    "    ':-Þ': '<tong>',\n",
    "    ':Þ': '<tong>',\n",
    "    ':-b': '<tong>',\n",
    "    ':b': '<tong>',\n",
    "    ':-&': '<tong>',\n",
    "    '>:\\\\': '<annoyed>',\n",
    "    '>:/': '<annoyed>',\n",
    "    ':-/': '<annoyed>',\n",
    "    ':-.': '<annoyed>',\n",
    "    ':/': '<annoyed>',\n",
    "    ':\\\\': '<annoyed>',\n",
    "    '=/': '<annoyed>',\n",
    "    '=\\\\': '<annoyed>',\n",
    "    ':L': '<annoyed>',\n",
    "    '=L': '<annoyed>',\n",
    "    ':S': '<annoyed>',\n",
    "    '>.<': '<annoyed>',\n",
    "    ':-|': '<annoyed>',\n",
    "    '<:-|': '<annoyed>',\n",
    "    ':-X': '<seallips>',\n",
    "    ':X': '<seallips>',\n",
    "    ':-#': '<seallips>',\n",
    "    ':#': '<seallips>',\n",
    "    'O:-)': '<angel>',\n",
    "    '0:-3': '<angel>',\n",
    "    '0:3': '<angel>',\n",
    "    '0:-)': '<angel>',\n",
    "    '0:)': '<angel>',\n",
    "    '0;^)': '<angel>',\n",
    "    '>:)': '<devil>',\n",
    "    '>:D': '<devil>',\n",
    "    '>:-D': '<devil>',\n",
    "    '>;)': '<devil>',\n",
    "    '>:-)': '<devil>',\n",
    "    '}:-)': '<devil>',\n",
    "    '}:)': '<devil>',\n",
    "    '3:-)': '<devil>',\n",
    "    '3:)': '<devil>',\n",
    "    'o/\\\\o': '<highfive>',\n",
    "    '^5': '<highfive>',\n",
    "    '>_>^': '<highfive>',\n",
    "    '^<_<': '<highfive>',\n",
    "    '<3': '<heart>',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "from rules import normalized_chars\n",
    "\n",
    "# Prefixes that `naive_stemmer` may strip from the start of a word.\n",
    "permulaan = [\n",
    "    'bel', 'se', 'ter', 'men', 'meng', 'mem', 'memper',\n",
    "    'di', 'pe', 'me', 'ke', 'ber', 'pen', 'per',\n",
    "]\n",
    "\n",
    "# Suffixes that `naive_stemmer` may strip from the end of a word.\n",
    "hujung = [\n",
    "    'kan',\n",
    "    'kah',\n",
    "    'lah',\n",
    "    'tah',\n",
    "    'nya',\n",
    "    'an',\n",
    "    'wan',\n",
    "    'wati',\n",
    "    'ita',\n",
    "]\n",
    "\n",
    "# Substrings that `cleaning` searches for inside each lower-cased token.\n",
    "laughing = {\n",
    "    'huhu', 'haha', 'gagaga', 'hihi',\n",
    "    'wkawka', 'wkwk', 'kiki', 'keke',\n",
    "    'huehue', 'hshs', 'hoho', 'hewhew',\n",
    "    'uwu', 'sksk', 'ksks', 'gituu',\n",
    "    'gitu', 'mmeeooww', 'meow', 'alhamdulillah',\n",
    "    'muah', 'mmuahh', 'hehe', 'salamramadhan',\n",
    "    'happywomensday', 'jahagaha', 'ahakss', 'ahksk',\n",
    "}\n",
    "\n",
    "def naive_stemmer(word):\n",
    "    \"\"\"Strip the longest matching suffix from `hujung`, then the longest\n",
    "    matching prefix from `permulaan`, and return what remains.\n",
    "\n",
    "    The prefix is matched against the word after the suffix has been removed.\n",
    "    The result can be an empty string when the word consists only of an\n",
    "    affix (e.g. 'kan').\n",
    "\n",
    "    Raises AssertionError if `word` is not a str.\n",
    "    \"\"\"\n",
    "    assert isinstance(word, str), 'input must be a string'\n",
    "    # '' is the fallback when nothing matches, so the slice below is skipped.\n",
    "    suffix = max((e for e in hujung if word.endswith(e)), key = len, default = '')\n",
    "    if suffix:\n",
    "        word = word[: -len(suffix)]\n",
    "    prefix = max((e for e in permulaan if word.startswith(e)), key = len, default = '')\n",
    "    if prefix:\n",
    "        word = word[len(prefix) :]\n",
    "    return word\n",
    "\n",
    "def make_cleaning(s, c_dict):\n",
    "    s = s.translate(c_dict)\n",
    "    return s\n",
    "\n",
    "def cleaning(string):\n",
    "    string = unidecode(string)\n",
    "    \n",
    "    string = ' '.join(\n",
    "        [make_cleaning(w, normalized_chars) for w in string.split()]\n",
    "    )\n",
    "    string = re.sub('\\(dot\\)', '.', string)\n",
    "    string = (\n",
    "        re.sub(re.findall(r'\\<a(.*?)\\>', string)[0], '', string)\n",
    "        if (len(re.findall(r'\\<a (.*?)\\>', string)) > 0)\n",
    "        and ('href' in re.findall(r'\\<a (.*?)\\>', string)[0])\n",
    "        else string\n",
    "    )\n",
    "    string = re.sub(\n",
    "        r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n",
    "    )\n",
    "    \n",
    "    chars = '.,/'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    string = [rules_normalizer.get(w, w) for w in string.split()]\n",
    "    string = [naive_stemmer(word) for word in string]\n",
    "    string = [w for w in string if len(w)]\n",
    "    string = [w for w in string if w[0] != '@']\n",
    "    x = []\n",
    "    for word in string:\n",
    "        word = word.lower()\n",
    "        if any([laugh in word for laugh in laughing]):\n",
    "            if random.random() >= 0.5:\n",
    "                x.append(word)\n",
    "        else:\n",
    "            x.append(word)\n",
    "    return ' '.join(x).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 7639745/7639745 [09:22<00:00, 13580.32it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import re\n",
    "\n",
    "for i in tqdm(range(len(texts))):\n",
    "    texts[i] = cleaning(texts[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 7639745/7639745 [00:04<00:00, 1551361.62it/s]\n"
     ]
    }
   ],
   "source": [
    "actual_t, actual_l = [], []\n",
    "\n",
    "for i in tqdm(range(len(texts))):\n",
    "    if len(texts[i]) > 2:\n",
    "        actual_t.append(texts[i])\n",
    "        actual_l.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('combined.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(actual_t))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import youtokentome as yttm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.5 s, sys: 5.36 s, total: 28.8 s\n",
      "Wall time: 7.14 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "bpe = yttm.BPE.train(data='combined.txt', \n",
    "               vocab_size=60000, model='nsfw.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60000"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = {v: i for i, v in enumerate(bpe.vocab())}\n",
    "rev_vocab = {i: v for i, v in enumerate(bpe.vocab())}\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "r = re.compile(r'[\\S]+').findall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'▁bapa ▁rogol ▁anak ▁jepun ▁ks ▁lucah ▁, ▁ks ▁video-video ▁, ▁fuck ▁. ▁. ▁. ▁- ▁menawan!'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subs = [' '.join(s) for s in bpe.encode(actual_t, output_type=yttm.OutputType.SUBWORD)]\n",
    "subs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf = TfidfVectorizer(vocabulary = vocab, token_pattern = r'[\\S]+').fit(subs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('tfidf-nsfw.pkl','wb') as fopen:\n",
    "    pickle.dump(tfidf,fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "vector = tfidf.transform(subs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((6090043, 60000), (1522511, 60000))"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(vector, actual_l, test_size = 0.2)\n",
    "train_X.shape, test_X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import ComplementNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "multinomial = ComplementNB().fit(train_X, train_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         sex       0.90      0.97      0.93   1106227\n",
      "    gambling       0.84      0.99      0.90    204641\n",
      "    negative       0.99      0.97      0.98   4779175\n",
      "\n",
      "    accuracy                           0.97   6090043\n",
      "   macro avg       0.91      0.97      0.94   6090043\n",
      "weighted avg       0.97      0.97      0.97   6090043\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "print(\n",
    "    metrics.classification_report(\n",
    "        train_Y,\n",
    "        multinomial.predict(train_X),\n",
    "        target_names = labels,\n",
    "        \n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         sex    0.89496   0.96479   0.92857    277203\n",
      "    gambling    0.83471   0.98683   0.90442     51329\n",
      "    negative    0.99105   0.96534   0.97803   1193979\n",
      "\n",
      "    accuracy                        0.96596   1522511\n",
      "   macro avg    0.90691   0.97232   0.93700   1522511\n",
      "weighted avg    0.96829   0.96596   0.96654   1522511\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    metrics.classification_report(\n",
    "        test_Y,\n",
    "        multinomial.predict(test_X),\n",
    "        target_names = labels,\n",
    "        digits = 5\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('multinomial-nsfw.pkl','wb') as fopen:\n",
    "    pickle.dump(multinomial,fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
